## Loading required package: s20x
在为时间序列数据建模的时候,分解成四个成分可以帮助理解和量化数据随时间的变化 例如:我们的模型可以如下:
\[Y_t = T_t + C_t + S_T + R_t\]
data("airpass.df")
passengers.ts = ts(airpass.df$passengers, start = 1949, frequency = 12)
plot(passengers.ts, main = "", ylab = "")
passengers.stl = stl(passengers.ts, s.window = "periodic")
plot(passengers.stl)
abline(v=1950:1960, lty=2)
plot(passengers.ts, main = "", ylab = "")
remainder = passengers.stl$time.series[,3]
data("mening.df")
mening.ts = ts(mening.df$mening, frequency=12, start=c(1990,1))
plot(mening.ts, main="")
abline(v=1991:2001, lty = 2)
# airpass.df = within(airpass.df, {month = factor(rep(month.abb, 12), levels = month.abb); time = 1:144})
# lines(passengers~month, data = airpass.df)
解读:
data(beer.df)
beer.ts = ts(beer.df, frequency = 12, start = c(1971,7))
plot(beer.ts, main = "US Beer Production, July 1970 to June 1978")
abline(v = 1971:1978, lty = 2)
解读:
passengers.ts = ts(airpass.df$passengers, start = 1949, frequency = 12)
plot(passengers.ts, main = "", ylab="")
abline(v=1950:1960, lty=2)
解读:
log.passengers.ts = log(passengers.ts)
plot(log.passengers.ts, main = "", ylab="")
abline(v=1950:1960, lty=2)
分解图:
passengers.stl = stl(log.passengers.ts, s.window = "periodic")
plot(passengers.stl)
title("Decomposition of log(Passengers)")
* 趋势存在少量的非线性,最多表明数据中可能有较弱的波动。
要解决的问题:
解决的方法:
# 代码段2 ----
passengers.ts = ts(log(airpass.df$passengers), start = 1949, frequency = 12)
plot(passengers.ts, main = "Data with Seasonal Component")
passengers.stl = stl(passengers.ts, s.window = "periodic")
sa.passengers = passengers.ts - passengers.stl$time.series[, "seasonal"]
plot(sa.passengers, main = "Data with seasonal component removed")
\[S_t = \alpha Y_t + (1 - \alpha) S_{t-1}\]
HoltWinters(data.df, alpha = x, beta = FALSE, gamma = FALSE) \(\alpha\) 由x来制定,beta 和 gamma 用在有趋势和季节性的时候data("larain.df")
LArain.ts = ts(rev(larain.df$LA.Rain) * 25.4, frequency = 1, start = 1877)
plot(LArain.ts, xlab = "Season", ylab = "Total rainfall (mm)")
LArain.hw = HoltWinters(LArain.ts, alpha = 0.5, beta = FALSE, gamma = FALSE)
plot(LArain.hw, lwd = 2, col.predicted = "blue", main = "alpha = 0.5")
LArain.hw = HoltWinters(LArain.ts, alpha = 0.95, beta = FALSE, gamma = FALSE)
plot(LArain.hw, lwd = 2, col.predicted = "blue", main = "alpha = 0.95")
LArain.hw = HoltWinters(LArain.ts, alpha = 0.75, beta = FALSE, gamma = FALSE)
plot(LArain.hw, lwd = 2, col.predicted = "blue", main = "alpha = 0.75")
LArain.hw = HoltWinters(LArain.ts, alpha = 0.25, beta = FALSE, gamma = FALSE)
plot(LArain.hw, lwd = 2, col.predicted = "blue", main = "alpha = 0.25")
LArain.hw = HoltWinters(LArain.ts, alpha = 0.05, beta = FALSE, gamma = FALSE)
plot(LArain.hw, lwd = 2, col.predicted = "blue", main = "alpha = 0.05")
在predict 中调用 n.ahead 参数
LArain.hw1 = HoltWinters(LArain.ts, beta = FALSE, gamma = FALSE)
LArain.pred = predict(LArain.hw1, n.ahead = 5)
plot(LArain.hw1, predicted.values = LArain.pred, lwd = 2, col.predicted = "blue")
LArain.pred = predict(LArain.hw1, n.ahead = 15, prediction.interval = TRUE)
plot(LArain.hw1, predicted.values = LArain.pred, lwd = 2, col.predicted = "blue")
hist(resid(LArain.hw1))
log.LArain.ts = log(LArain.ts)
LArain.hw2 = HoltWinters(log.LArain.ts, beta = FALSE, gamma = FALSE)
plot(LArain.hw2)
log.LArain.pred = predict(LArain.hw2, n.ahead = 15, prediction.interval = TRUE)
exp(log.LArain.pred)
## Time Series:
## Start = 1943
## End = 1957
## Frequency = 1
## fit upr lwr
## 1943 343.7816 929.1857 127.1928
## 1944 343.7816 931.5911 126.8644
## 1945 343.7816 933.9966 126.5377
## 1946 343.7816 936.4020 126.2126
## 1947 343.7816 938.8074 125.8893
## 1948 343.7816 941.2129 125.5675
## 1949 343.7816 943.6184 125.2474
## 1950 343.7816 946.0239 124.9289
## 1951 343.7816 948.4296 124.6121
## 1952 343.7816 950.8353 124.2968
## 1953 343.7816 953.2411 123.9831
## 1954 343.7816 955.6470 123.6709
## 1955 343.7816 958.0531 123.3604
## 1956 343.7816 960.4593 123.0513
## 1957 343.7816 962.8657 122.7438
log.pass.ts = ts(log(airpass.df$passengers), frequency = 12, start = 1949)
log.pass.hw = HoltWinters(log.pass.ts)
log.pass.pred = predict(log.pass.hw, n.ahead = 30)
plot(log.pass.hw, log.pass.pred, col.predicted = "blue", lwd = 2)
让R来决定最优的 \(\alpha, \beta , \gamma\)
log.pass.pred = predict(log.pass.hw, n.ahead = 30, prediction.interval = TRUE)
plot(log.pass.hw, log.pass.pred, col.predicted = "blue", col.intervals = "red", lwd = 2)
最简单的就是检查残差的自相关函数图(ACF) 首先要明确用回归模型分析时间序列的策略:
plot(log.passengers.ts)
# 第二步:添加季节项 ----
airpass.df = within(airpass.df, { month = factor(rep(month.abb, 12), levels = month.abb); time = 1:144})
pass.fit = lm(log(airpass.df$passengers)~month, data = airpass.df)
plot(pass.fit$residuals)
# plot(pass.fit$residuals, type = "l")
lines(airpass.df$time, residuals(pass.fit)) # 用线将点连接起来
lines(lowess(airpass.df$time, residuals(pass.fit))) # 用离散点平滑后画出的曲线,用来了解趋势再好不过了。
# lowess returns a list containing components x and y which give the coordinates of the smooth. The smooth can be added to a plot of the original points with the function lines: see the examples. 在这里显然不用了,因为散点图已经可以看出明显的趋势了。
# 第三部:添加时间项 ----
pass.fit1 = lm( log(airpass.df$passengers)~time + month, data = airpass.df )
anova(pass.fit1)
## Analysis of Variance Table
##
## Response: log(airpass.df$passengers)
## Df Sum Sq Mean Sq F value Pr(>F)
## time 1 25.1233 25.1233 7143.582 < 2.2e-16 ***
## month 11 2.2843 0.2077 59.047 < 2.2e-16 ***
## Residuals 131 0.4607 0.0035
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
p-value可以看成指示性的,因为不满足独立性假设。
plot( residuals(pass.fit1)~fitted(pass.fit1))
lines(lowess(fitted(pass.fit1), residuals(pass.fit1)))
二次方的?
pass.fit2 = lm( log(airpass.df$passengers)~time + month + I(time^2), data = airpass.df )
anova(pass.fit2)
## Analysis of Variance Table
##
## Response: log(airpass.df$passengers)
## Df Sum Sq Mean Sq F value Pr(>F)
## time 1 25.1233 25.1233 10813.900 < 2.2e-16 ***
## month 11 2.2843 0.2077 89.386 < 2.2e-16 ***
## I(time^2) 1 0.1587 0.1587 68.307 1.409e-13 ***
## Residuals 130 0.3020 0.0023
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(residuals(pass.fit2)~fitted(pass.fit2))
lines(lowess(fitted(pass.fit2), residuals(pass.fit2)))
plot(residuals(pass.fit2), type = "l") #link the points lower case L
lines(lowess(fitted(pass.fit2), residuals(pass.fit2)))
* 这回看起来好多了,相对恒定而且没有趋势了 * 但残差对时间的图表现出非独立性 4. 残差是否有自相关性?
acf(residuals(pass.fit2))
线性相关系数, \(\rho\),通常被当做相关性,是衡量两个随机变量\(X 和 Y 之间的线性关系,其中 0<\rho<1\)
\(如果 \rho = 0 \,则X和Y是完全不相关\)
在简单线性回归中样本的相关系数: \[r = \rm sign(\beta_1) \sqrt{R^2}\]
这里的相关性是指 \(Y_t 相对于 Y_{t+h}\)的 其中: t=1,…, T-2.
第三条垂直线的高度是\(\rm Cor[Y_t, Y_{t+2}]\), lag-2自相关又称为二阶自相关,以此类推
蓝色的水平虚线是我们认为自相关性I显著的位置。大致来讲,如果垂直线超过虚线,则\(H_0: \rho_h = 0\) 的 P-value将会小于0.05。(由于多重比较,即使没有相关性,有一两个自相关值稍微超过虚线也是正常的)
回到前面如何识别自相关性的第四步
acf(residuals(pass.fit2))
* 这是明显的自相关模式,有很多垂直线超过了虚线。我们遇到了自相关问题。 * 要解决这个问题,我们明确建立一个一阶自相关模型,通过引进一个延迟响应变量作为解释变量到模型中来解决这个问题。 * 即:\(Y_1\)用来解释\(Y_2\),\(Y_2\)用来解释\(Y_3 \cdots\), 以此类推 * 这意味着我们不能使用\(Y_1\)作为相应变量,因为没有\(Y_0\)
log(passengers)[-1] 去掉第一项 * 这类模型称为延迟响应模型
pass.fit3 = lm( log(passengers)[-1] ~ time[-1] + month[-1] + I(time[-1]^2) + log(passengers)[-144], data = airpass.df)
anova(pass.fit3)
## Analysis of Variance Table
##
## Response: log(passengers)[-1]
## Df Sum Sq Mean Sq F value Pr(>F)
## time[-1] 1 24.4515 24.4515 19260.14 < 2.2e-16 ***
## month[-1] 11 2.2733 0.2067 162.79 < 2.2e-16 ***
## I(time[-1]^2) 1 0.1617 0.1617 127.35 < 2.2e-16 ***
## log(passengers)[-144] 1 0.1362 0.1362 107.25 < 2.2e-16 ***
## Residuals 128 0.1625 0.0013
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
acf(residuals(pass.fit3))